{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "import json\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import sentencepiece\n",
    "from ray.util.multiprocessing import Pool\n",
    "\n",
    "\n",
    "DATASET = \"../../dataset_processed/sharegpt_gpt4.json\"\n",
    "TOKENIZER = \"../../tokenizer/llama_tokenizer.model\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-05-24 20:04:38,940\tINFO worker.py:1625 -- Started a local Ray instance.\n"
     ]
    }
   ],
   "source": [
    "# Load dataset\n",
    "tokenizer = sentencepiece.SentencePieceProcessor(model_file=TOKENIZER)\n",
    "with open(DATASET, \"r\") as f:\n",
    "    dataset = json.load(f)\n",
    "\n",
    "# Parallel tokenization\n",
    "def _tokenize(sample):\n",
    "    for c in sample[\"items\"]:\n",
    "        c[\"value\"] = tokenizer.tokenize(c[\"value\"])\n",
    "\n",
    "    return sample\n",
    "\n",
    "dataset_tokenized = Pool().map(_tokenize, dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "PROMPT_LEN = {\n",
    "    \"human\": 4,\n",
    "    \"gpt\": 5\n",
    "}\n",
    "\n",
    "dataset_len = np.array([sum(len(c[\"value\"]) + PROMPT_LEN[c[\"from\"]] for c in sample[\"items\"]) for sample in dataset_tokenized])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([4650.,  896.,  301.,  154.,   71.,   33.,   33.,   17.,   10.,\n",
       "          10.]),\n",
       " array([    0.,  5000., 10000., 15000., 20000., 25000., 30000., 35000.,\n",
       "        40000., 45000., 50000.]),\n",
       " <BarContainer object of 10 artists>)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjAAAAGdCAYAAAAMm0nCAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAiuklEQVR4nO3de3BU5cHH8V8u7BIuu+FiNkQSiYOCUcASNGy9vEVSVoxWK0zRUmUEtdDACFhulYLazoTBKgVBsKU1zlRF6AgqETATJFQJt2g04ZJqGxta3ASL2Q0UEiDP+4eTM6ygkhBInvj9zOwMOefZs895IJPvnOxZoowxRgAAABaJbu0JAAAANBUBAwAArEPAAAAA6xAwAADAOgQMAACwDgEDAACsQ8AAAADrEDAAAMA6sa09gQuloaFBBw8eVNeuXRUVFdXa0wEAAOfAGKPa2lolJSUpOvrrr7O024A5ePCgkpOTW3saAACgGQ4cOKDevXt/7f52GzBdu3aV9OUCeDyeVp4NAAA4F+FwWMnJyc7P8a/TbgOm8ddGHo+HgAEAwDLf9vYP3sQLAACsQ8AAAADrEDAAAMA6BAwAALAOAQMAAKxDwAAAAOsQMAAAwDoEDAAAsA4BAwAArEPAAAAA6xAwAADAOgQMAACwDgEDAACsQ8AAAADrxLb2BGzUZ3Zea0+hyT5dkNXaUwAAoMVwBQYAAFiHgAEAANYhYAAAgHUIGAAAYB0CBgAAWIeAAQAA1iFgAACAdQgYAABgHQIGAABYh4ABAADWIWAAAIB1CBgAAGAdAgYAAFiHgAEAANYhYAAAgHUIGAAAYB0CBgAAWIeAAQAA1iFgAACAdQgYAABgHQIGAABYh4ABAADWIWAAAIB1CBgAAGAdAgYAAFiHgAEAANYhYAAAgHUIGAAAYB0CBgAAWIeAAQAA1iFgAACAdQgYAABgHQIGAABYh4ABAADWIWAAAIB1CBgAAGAdAgYAAFiHgAEAANYhYAAAgHUIGAAAYB0CBgAAWIeAAQAA1iFgAACAdQgYAABgnfMKmAULFigqKkpTp051th0/flzZ2dnq0aOHunTpolGjRqmqqirieZWVlcrKylKnTp2UkJCgGTNm6OTJkxFjtmzZosGDB8vtdqtv377Kzc09n6kCAIB2pNkBs2vXLj3//PMaOHBgxPZp06bpzTff1Jo1a1RYWKiDBw/q7rvvdvafOnVKWVlZqq+v17Zt2/Tiiy8qNzdX8+bNc8ZUVFQoKytLw4YNU0lJiaZOnaoHH3xQmzZtau50AQBAO9KsgDly5IjGjh2rP/7xj+rWrZuzPRQK6U9/+pOeeeYZ3XLLLUpPT9cLL7ygbdu2afv27ZKkt99+W3v37tVf/vIXXXvttRo5cqR+85vfaNmyZaqvr5ckrVixQqmpqXr66ad11VVXafLkyRo9erQWLVrUAqcMAABs16yAyc7OVlZWljIzMyO2FxcX68SJExHb+/fvr5SUFBUVFUmSioqKNGDAAPl8PmdMIBBQOBzWnj17nDFfPXYgEHCOcTZ1dXUKh8MRDwAA0D7FNvUJq1at0vvvv69du3adsS8YDMrlcik+Pj5iu8/nUzAYdMacHi+N+xv3fdOYcDisY8eOKS4u7ozXzsnJ0RNPPNHU0wEAABZq0hWYAwcO6JFHHtFLL72kjh07Xqg5NcucOXMUCoWcx4EDB1p7SgAA4AJpUsAUFxerurpagwcPVmxsrGJjY1VYWKglS5YoNjZWPp9P9fX1qqmpiXheVVWVEhMTJUmJiYln3JXU+PW3jfF4PGe9+iJJbrdbHo8n4gEAANqnJgXM8OHDVVpaqpKSEucxZMgQjR071vlzhw4dVFBQ4DynvLxclZWV8vv9kiS/36/S0lJVV1c7Y/Lz8+XxeJSWluaMOf0YjWMajwEAAL7bmvQemK5du+qaa66J2Na5c2f16NHD2T5hwgRNnz5d3bt3l8fj0ZQpU+T3+zV06FBJ0ogRI5SWlqb77rtPCxcuVDAY1Ny5c5WdnS232y1JmjhxopYuXaqZM2dq/Pjx2rx5s1avXq28vLyWOGcAAGC5Jr+J99ssWrRI0dHRGjVqlOrq6hQIBPTcc885+2NiYrR+/XpNmjRJfr9fnTt31rhx4/Tkk086Y1JTU5WXl6dp06Zp8eLF6t27t1auXKlAINDS0wUAABaKMsaY1p7EhRAOh+X1ehUKhVr8/TB9Ztt3JejTBVmtPQUAAL7Vuf785v9CAgAA1iFgAACAdQgYAABgHQIGAABYh4ABAADWIWAAAIB1CBgAAGAdAgYAAFiHgAEAANYhYAAAgHUIGAAAYB0CBgAAWIeAAQAA1iFgAACAdQgYAABgHQIGAABYh4ABAADWIWAAAIB1CBgAAGAdAgYAAFiHgAEAANYhYAAAgHUIGAAAYB0CBgAAWIeAAQAA1iFgAACAdQgYAABgHQIGAABYh4ABAADWIWAAAIB1CBgAAGAdAgYAAFiHgAEAANYhYAAAgHUIGAAAYB0CBgAAWIeAAQAA1iFgAACAdQgYAABgHQIGAABYh4ABAADWIWAAAIB1CBgAAGAdAgYAAFiHgAEAANYhYAAAgHUIGAAAYB0CBgAAWIeAAQAA1iFgAACAdQgYAABgHQIGAABYh4ABAADWIWAAAIB1CBgAAGAdAgYAAFiHgAEAANYhYAAAgHUIGAAAYB0CBgAAWIeAAQAA1iFgAACAdQgYAABgHQIGAABYh4ABAADWaVLALF++XAMHDpTH45HH45Hf79eGDRuc/cePH1d2drZ69OihLl26aNSoUaqqqoo4RmVlpbKystSpUyclJCRoxowZOnnyZMSYLVu2aPDgwXK73erbt69yc3Obf4YAAKDdaVLA9O7dWwsWLFBxcbF2796tW265RXfeeaf27NkjSZo2bZrefPNNrVmzRoWFhTp48KDuvvtu5/mnTp1SVlaW6uvrtW3bNr344ovKzc3VvHnznDEVFRXKysrSsGHDVFJSoqlTp+rBBx/Upk2bWuiUAQCA7aKMMeZ8DtC9e3c99dRTGj16tC655BK9/PLLGj16tCRp//79uuqqq1RUVKShQ4dqw4YNuv3223Xw4EH5fD5J0ooVKzRr1iwdOnRILpdLs2bNUl5ensrKypzXuOeee1RTU6ONGzee87zC4bC8Xq9CoZA8Hs/5nOIZ+szOa9HjXQyfLshq7SkAAPCtzvXnd7PfA3Pq1CmtWrVKR48eld/vV3FxsU6cOKHMzExnTP/+/ZWSkqKioiJJUlFRkQYMGODEiyQFAgGFw2HnKk5RUVHEMRrHNB7j69TV1SkcDkc8AABA+9TkgCktLVWXLl3kdrs1ceJErV27VmlpaQoGg3K5XIqPj48Y7/P5FAwGJUnBYDAiXhr3N+77pjHhcFjHjh372nnl5OTI6/U6j+Tk5KaeGgAAsESTA6Zfv34qKSnRjh07NGnSJI0bN0579+69EHNrkjlz5igUCjmPAwcOtPaUAADABRLb1Ce4XC717dtXkpSenq5du3Zp8eLFGjNmjOrr61VTUxNxFaaqqkqJiYmSpMTERO3cuTPieI13KZ0+5qt3LlVVVcnj8SguLu5r5+V2u+V2u5t6OgAAwELn/TkwDQ0NqqurU3p6ujp06KCCggJnX3l5uSorK+X3+yVJfr9fpaWlqq6udsbk5+fL4/EoLS3NGXP6MRrHNB4DAACgSVdg5syZo5EjRyolJUW1tbV6+eWXtWXLFm3atEler1cTJkzQ9OnT1b17d3k8Hk2ZMkV+v19Dhw6VJI0YMUJpaWm67777tHDhQgWDQc2dO1fZ2dnO1ZOJEydq6dKlmjlzpsaPH6/Nmzdr9erVysuz784fAABwYTQpYKqrq3X//ffrs88+k9fr1cCBA7Vp0yb98Ic/lCQtWrRI0dHRGjVqlOrq6hQIBPTcc885z4+JidH69es1adIk+f1+de7cWePGjdOTTz7pjElNTVVeXp6mTZumxYsXq3fv3lq5cqUCgUALnTIAALDdeX8OTFvF58BE4nNgAAA2uOCfAwMAANBaCBgAAGAdAgYAAFiHgAEAANYhYAAAgHUIGAAAYB0CBgAAWIeAAQAA1iFgAACAdQgYAABgHQIGAABYh4ABAADWIWAAAIB1CBgAAGAdAgYAAFiHgAEAANYhYAAAgHUIGAAAYB0CBgAAWIeAAQAA1iFgAACAdQgYAABgHQIGAABYh4ABAADWIWAAAIB1CBgAAGAdAgYAAFiHgAEAANYhYAAAgHUIGAAAYB0CBgAAWIeAAQAA1iFgAACAdQgYAABgHQIGAABYh4ABAADWIWAAAIB1CBgAAGAdAgYAAFiHgAEAANYhYAAAgHUIGAAAYB0CBgAAWIeAAQAA1iFgAACAdQgYAABgHQIGAABYh4ABAADWIWAAAIB1CBgAAGAdAgYAAFiHgAEAANYhYAAAgHUIGAAAYB0CBgAAWIeAAQAA1iFgAACAdQgYAABgHQIGAABYh4ABAADWIWAAAIB1CBgAAGAdAgYAAFiHgAEAANYhYAAAgHWaFDA5OTm67rrr1LVrVyUkJOiuu+5SeXl5xJjjx48rOztbPXr0UJcuXTRq1ChVVVVFjKmsrFRWVpY6deqkhIQEzZgxQydPnowYs2XLFg0ePFhut1t9+/ZVbm5u884QAAC0O00KmMLCQmVnZ2v79u3Kz8/XiRMnNGLECB09etQZM23aNL355ptas2aNCgsLdfDgQd19993O/lOnTikrK0v19fXatm2bXnzxReXm5mrevHnOmIqKCmVlZWnYsGEqKSnR1KlT9eCDD2rTpk0tcMoAAMB2UcYY09wnHzp0SAkJCSosLNTNN9+sUCikSy65RC+//LJGjx4tSdq/f7+uuuoqFRUVaejQodqwYYNuv/12HTx4UD6fT5K0YsUKzZo1S4cOHZLL5dKsWbOUl5ensrIy57Xuuece1dTUaOPGjec0t3A4LK/Xq1AoJI/H09xTPKs+s/Na9HgXw6cLslp7CgAAfKtz/fl9Xu+BCYVCkqTu3btLkoqLi3XixAllZmY6Y/r376+UlBQVFRVJkoqKijRgwAAnXiQpEAgoHA5rz549zpjTj9E4pvEYZ1NXV6dwOBzxAAAA7VOzA6ahoUFTp07VDTfcoGuuuUaSFAwG5XK5FB8fHzHW5/MpGAw6Y06Pl8b9jfu+aUw4HNaxY8fOOp+cnBx5vV7nkZyc3NxTAwAAbVyzAyY7O1tlZWVatWpVS86n2ebMmaNQKOQ8Dhw40NpTAgAAF0hsc540efJkrV+/Xlu3blXv3r2d7YmJiaqvr1dNTU3EVZiqqiolJiY6Y3bu3BlxvMa7lE4f89U7l6qqquTxeBQXF3fWObndbrnd7uacDgAAsEyTrsAYYzR58mStXbtWmzdvVmpqasT+9PR0dejQQQUFBc628vJyVVZWyu/3S5L8fr9KS0tVXV3tjMnPz5fH41FaWpoz5vRjNI5pPAYAAPhua9IVmOzsbL388st6/fXX1bVrV+c9K16vV3FxcfJ6vZowYYKmT5+u7t27y+PxaMqUKfL7/Ro6dKgkacSIEUpLS9N9992nhQsXKhgMau7cucrOznauoEycOFFLly7VzJkzNX78eG3evFmrV69WXp59d/8AAICW16QrMMuXL1coFNIPfvAD9erVy3m8+uqrzphFixbp9ttv16hRo3TzzTcrMTFRr732mrM/JiZG69evV0xMjPx+v372s5/p/vvv15NPPumMSU1NVV5envLz8zVo0CA9/fTTWrlypQKBQAucMgAAsN15fQ5MW8bnwETic2AAADa4KJ8DAwAA0BoIGAAAYB0CBgAAWIeAAQAA1iFgAACAdQgYAABgHQIGAABYh4ABAADWIWAAAIB1CBgAAGAdAgYAAFiHgAEAANYhYAAAgHUIGAAAYB0CBgAAWIeAAQAA1iFgAACAdQgYAABgHQIGAABYh4ABAADWIWAAAIB1CBgAAGAdAgYAAFiHgAEAANYhYAAAgHUIGAAAYB0CBgAAWIeAAQAA1iFgAACAdQgYAABgHQIGAABYh4ABAADWIWAAAIB1CBgAAGAdAgYAAFiHgAEAANYhYAAAgHUIGAAAYB0CBgAAWIeAAQAA1iFgAACAdQgYAABgHQIGAABYh4ABAADWIWAAAIB1CBgAAGAdAgYAAFiHgAEAANYhYAAAgHUIGAAAYB0CBgAAWIeAAQAA1iFgAACAdQgYAABgHQIGAABYh4ABAADWIWAAAIB1CBgAAGAdAgYAAFiHgAEAANYhYAAAgHUIGAAAYB0CBgAAWIeAAQAA1mlywGzdulV33HGHkpKSFBUVpXXr1kXsN8Zo3rx56tWrl+Li4pSZmamPP/44Yszhw4c1duxYeTwexcfHa8KECTpy5EjEmI8++kg33XSTOnbsqOTkZC1cuLDpZwcAANqlJgfM0aNHNWjQIC1btuys+xcuXKglS5ZoxYoV2rFjhzp37qxAIKDjx487Y8aOHas9e/YoPz9f69ev19atW/Xwww87+8PhsEaMGKHLLrtMxcXFeuqpp/T444/rD3/4QzNOEQAAtDdRxhjT7CdHRWnt2rW66667JH159SUpKUmPPvqofvnLX0qSQqGQfD6fcnNzdc8992jfvn1KS0vTrl27NGTIEEnSxo0bddttt+nf//63kpKStHz5cj322GMKBoNyuVySpNmzZ2vdunXav3//Oc0tHA7L6/UqFArJ4/E09xTPqs/svBY93sXw6YKs1p4CAADf6lx/frfoe2AqKioUDAaVmZnpbPN6vcrIyFBRUZEkqaioSPHx8U68SFJmZqaio6O1Y8cOZ8zNN9/sxIskBQIBlZeX64svvmjJKQMAAAvFtuTBgsGgJMnn80Vs9/l8zr5gMKiEhITIScTGqnv37hFjUlNTzzhG475u3bqd8dp1dXWqq6tzvg6Hw+d5NgAAoK1qN3ch5eTkyOv1Oo/k5OTWnhIAALhAWjRgEhMTJUlVVVUR26uqqpx9iYmJqq6ujth/8uRJHT58OGLM2Y5x+mt81Zw5cxQKhZzHgQMHzv+EAABAm9SiAZOamqrExEQVFBQ428LhsHbs2CG/3y9J8vv9qqmpUXFxsTNm8+bNamhoUEZGhjNm69atOnHihDMmPz9f/fr1O+uvjyTJ7XbL4/FEPAAAQPvU5IA5cuSISkpKVFJSIunLN+6WlJSosrJSUVFRmjp1qn7729/qjTfeUGlpqe6//34lJSU5dypdddVVuvXWW/XQQw9p586deu+99zR58mTdc889SkpKkiT99Kc/lcvl0oQJE7Rnzx69+uqrWrx4saZPn95iJw4AAOzV5Dfx7t69W8OGDXO+boyKcePGKTc3VzNnztTRo0f18MMPq6amRjfeeKM2btyojh07Os956aWXNHnyZA0fPlzR0dEaNWqUlixZ4uz3er16++23lZ2drfT0dPXs2VPz5s2L+KwYAADw3XVenwPTlvE5MJH4HBgAgA1a5XNgAAAALoYW/RwYtF02XjWSuHIEADg7rsAAAADrEDAAAMA6BAwAALAOAQMAAKxDwAAAAOsQMAAAwDoEDAAAsA4BAwAArEPAAAAA6xAwAADAOgQMAACwDgEDAACsQ8AAAADrEDAAAMA6BAwAALAOAQMAAKxDwAAAAOsQMAAAwDoEDAAAsA4BAwAArEPAAAAA6xAwAADAOgQMAACwDgEDAACsQ8AAAADrEDAAAMA6BAwAALAOAQMAAKxDwAAAAOsQMAAAwDoEDAAAsA4BAwAArEPAAAAA6xAwAADAOgQMAACwDgEDAACsQ8AAAADrEDAAAMA6BAwAALAOAQMAAKxDwAAAAOsQMAAAwDoEDAAAsA4BAwAArBPb2hMAvkmf2XmtPYUm+3RBVmtPAQDaPa7AAAAA6xAwAADAOgQMAACwDgEDAACsQ8AAAADrEDAAAMA6BAwAALAOAQMAAKxDwAAAAOsQMAAAwDoEDAAAsA7/FxLQwvj/mwDgwuMKDAAAsA4BAwAArEPAAAAA6xAwAADAOgQMAACwDnchAeDOKQDWadNXYJYtW6Y+ffqoY8eOysjI0M6dO1t7SgAAoA1oswHz6quvavr06Zo/f77ef/99DRo0SIFAQNXV1a09NQAA0MqijDGmtSdxNhkZGbruuuu0dOlSSVJDQ4OSk5M1ZcoUzZ49+1ufHw6H5fV6FQqF5PF4WnRuNl5uB4Dm4Fd1uNjO9ed3m3wPTH19vYqLizVnzhxnW3R0tDIzM1VUVHTW59TV1amurs75OhQKSfpyIVpaQ93/WvyYANAWpUxb09pTaLKyJwKtPQWch8af2992faVNBsznn3+uU6dOyefzRWz3+Xzav3//WZ+Tk5OjJ5544oztycnJF2SOAIC2yfv71p4BWkJtba28Xu/X7m+TAdMcc+bM0fTp052vGxoadPjwYfXo0UNRUVEt9jrhcFjJyck6cOBAi/9qCpFY64uDdb44WOeLg3W+OC7kOhtjVFtbq6SkpG8c1yYDpmfPnoqJiVFVVVXE9qqqKiUmJp71OW63W263O2JbfHz8hZqiPB4P3xwXCWt9cbDOFwfrfHGwzhfHhVrnb7ry0qhN3oXkcrmUnp6ugoICZ1tDQ4MKCgrk9/tbcWYAAKAtaJNXYCRp+vTpGjdunIYMGaLrr79ev//973X06FE98MADrT01AADQytpswIwZM0aHDh3SvHnzFAwGde2112rjxo1nvLH3YnO73Zo/f/4Zv65Cy2OtLw7W+eJgnS8O1vniaAvr3GY/BwYAAODrtMn3wAAAAHwTAgYAAFiHgAEAANYhYAAAgHUImCZatmyZ+vTpo44dOyojI0M7d+5s7Sm1GVu3btUdd9yhpKQkRUVFad26dRH7jTGaN2+eevXqpbi4OGVmZurjjz+OGHP48GGNHTtWHo9H8fHxmjBhgo4cORIx5qOPPtJNN92kjh07Kjk5WQsXLjxjLmvWrFH//v3VsWNHDRgwQG+99VaLn29rycnJ0XXXXaeuXbsqISFBd911l8rLyyPGHD9+XNnZ2erRo4e6dOmiUaNGnfHBkJWVlcrKylKnTp2UkJCgGTNm6OTJkxFjtmzZosGDB8vtdqtv377Kzc09Yz7t9Xti+fLlGjhwoPNBXX6/Xxs2bHD2s8YXxoIFCxQVFaWpU6c621jr8/f4448rKioq4tG/f39nv5VrbHDOVq1aZVwul/nzn/9s9uzZYx566CETHx9vqqqqWntqbcJbb71lHnvsMfPaa68ZSWbt2rUR+xcsWGC8Xq9Zt26d+fDDD82PfvQjk5qaao4dO+aMufXWW82gQYPM9u3bzd/+9jfTt29fc++99zr7Q6GQ8fl8ZuzYsaasrMy88sorJi4uzjz//PPOmPfee8/ExMSYhQsXmr1795q5c+eaDh06mNLS0gu+BhdDIBAwL7zwgikrKzMlJSXmtttuMykpKebIkSPOmIkTJ5rk5GRTUFBgdu/ebYYOHWq+//3vO/tPnjxprrnmGpOZmWk++OAD89Zbb5mePXuaOXPmOGP++c9/mk6dOpnp06ebvXv3mmeffdbExMSYjRs3OmPa8/fEG2+8YfLy8szf//53U15ebn71q1+ZDh06mLKyMmMMa3wh7Ny50/Tp08cMHDjQPPLII8521vr8zZ8/31x99dXms88+cx6HDh1y9tu4xgRME1x//fUmOzvb+frUqVMmKSnJ5OTktOKs2qavBkxDQ4NJTEw0Tz31lLOtpqbGuN1u88orrxhjjNm7d6+RZHbt2uWM2bBhg4mKijL/+c9/jDHGPPfcc6Zbt26mrq7OGTNr1izTr18/5+uf/OQnJisrK2I+GRkZ5uc//3mLnmNbUV1dbSSZwsJCY8yX69qhQwezZs0aZ8y+ffuMJFNUVGSM+TI2o6OjTTAYdMYsX77ceDweZ21nzpxprr766ojXGjNmjAkEAs7X37XviW7dupmVK1eyxhdAbW2tueKKK0x+fr75v//7PydgWOuWMX/+fDNo0KCz7rN1jfkV0jmqr69XcXGxMjMznW3R0dHKzMxUUVFRK87MDhUVFQoGgxHr5/V6lZGR4axfUVGR4uPjNWTIEGdMZmamoqOjtWPHDmfMzTffLJfL5YwJBAIqLy/XF1984Yw5/XUax7TXv6dQKCRJ6t69uySpuLhYJ06ciFiD/v37KyUlJWKtBwwYEPHBkIFAQOFwWHv27HHGfNM6fpe+J06dOqVVq1bp6NGj8vv9rPEFkJ2draysrDPWg7VuOR9//LGSkpJ0+eWXa+zYsaqsrJRk7xoTMOfo888/16lTp874JGCfz6dgMNhKs7JH4xp90/oFg0ElJCRE7I+NjVX37t0jxpztGKe/xteNaY9/Tw0NDZo6dapuuOEGXXPNNZK+PH+Xy3XGf2b61bVu7jqGw2EdO3bsO/E9UVpaqi5dusjtdmvixIlau3at0tLSWOMWtmrVKr3//vvKyck5Yx9r3TIyMjKUm5urjRs3avny5aqoqNBNN92k2tpaa9e4zf5XAgC+XXZ2tsrKyvTuu++29lTapX79+qmkpEShUEh//etfNW7cOBUWFrb2tNqVAwcO6JFHHlF+fr46duzY2tNpt0aOHOn8eeDAgcrIyNBll12m1atXKy4urhVn1nxcgTlHPXv2VExMzBnvyq6qqlJiYmIrzcoejWv0TeuXmJio6urqiP0nT57U4cOHI8ac7Rinv8bXjWlvf0+TJ0/W+vXr9c4776h3797O9sTERNXX16umpiZi/FfXurnr6PF4FBcX9534nnC5XOrbt6/S09OVk5OjQYMGafHixaxxCyouLlZ1dbUGDx6s2NhYxcbGqrCwUEuWLFFsbKx8Ph9rfQHEx8fryiuv1CeffGLtv2cC5hy5XC6lp6eroKDA2dbQ0KCCggL5/f5WnJkdUlNTlZiYGLF+4XBYO3bscNbP7/erpqZGxcXFzpjNmzeroaFBGRkZzpitW7fqxIkTzpj8/Hz169dP3bp1c8ac/jqNY9rL35MxRpMnT9batWu1efNmpaamRuxPT09Xhw4dItagvLxclZWVEWtdWloaEYz5+fnyeDxKS0tzxnzTOn4XvycaGhpUV1fHGreg4cOHq7S0VCUlJc5jyJAhGjt2rPNn1rrlHTlyRP/4xz/Uq1cve/89N/ltv99hq1atMm632+Tm5pq9e/eahx9+2MTHx0e8K/u7rLa21nzwwQfmgw8+MJLMM888Yz744APzr3/9yxjz5W3U8fHx5vXXXzcfffSRufPOO896G/X3vvc9s2PHDvPuu++aK664IuI26pqaGuPz+cx9991nysrKzKpVq0ynTp3OuI06NjbW/O53vzP79u0z8+fPb1e3UU+aNMl4vV6zZcuWiFsi//e//zljJk6caFJSUszmzZvN7t27jd/vN36/39nfeEvkiBEjTElJidm4caO55JJLznpL5IwZM8y+ffvMsmXLznpLZHv9npg9e7YpLCw0FRUV5qOPPjKzZ882UVFR5u233zbGsMYX0ul3IRnDWreERx991GzZssVUVFSY9957z2RmZpqePXua6upqY4yda0zANNGzzz5rUlJSjMvlMtdff73Zvn17a0+pzXjnnXeMpDMe48aNM8Z8eSv1r3/9a+Pz+Yzb7TbDhw835eXlEcf473//a+69917TpUsX4/F4zAMPPGBqa2sjxnz44YfmxhtvNG6321x66aVmwYIFZ8xl9erV5sorrzQul8tcffXVJi8v74Kd98V2tjWWZF544QVnzLFjx8wvfvEL061bN9OpUyfz4x//2Hz22WcRx/n000/NyJEjTVxcnOnZs6d59NFHzYkTJyLGvPPOO+baa681LpfLXH755RGv0ai9fk+MHz/eXHbZZcblcplLLrnEDB8+3IkXY1jjC+mrAcNan78xY8aYXr16GZfLZS699FIzZswY88knnzj7bVzjKGOMafp1GwAAgNbDe2AAAIB1CBgAAGAdAgYAAFiHgAEAANYhYAAAgHUIGAAAYB0CBgAAWIeAAQAA1iFgAACAdQgYAABgHQIGAABYh4ABAADW+X9sek0zUsUJ6wAAAABJRU5ErkJggg==",
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.hist(dataset_len, range=(0, 50000))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5334"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.sum(dataset_len < 8192)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "jax",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
